/* This dofile is to match tvet admission data for grade 10 students with the current database */

// import dataset, clean and separate for grade 10 and grade 12
import excel $noncbt, clear firstrow 
drop M N O
ren nac_status_name app_status
tempfile file1
save `file1', replace 

import excel $cbt, clear firstrow 
g a_CBT = 1
lab var a_CBT "whether the data is from CBT-trainees" 
append using `file1'

ren app_name name
ren cid_no citizenid
ren dob dateofbirth

rename *, lower   

g i_driving_tvetadmin = 0
replace i_driving_tvetadmin = 1 if strpos(institute_name, "Driving")>0
lab var i_driving_tvetadmin "0/1 whether the institute is driving center in the tvet admin data"

g i_driving_private = 0
replace i_driving_private = 1 if i_driving_tvetadmin == 1 | a_cbt == 1
lab var i_driving_private "0/1 whether the institute is driving center or private institute in the tvet admin data"


replace citizenid = "" if length(citizenid)~=11
replace citizenid = "" if citizenid == "UNDER PROCE"
destring citizenid, replace

tostring citizenid, replace format("%20.0f")

replace institute_name = trim(itrim(lower(institute_name)))
tab institute_name

replace name = trim(itrim(lower(name)))
replace name = subinstr(name," ","",.)  
replace name = subinstr(name,",","",.)   //Removes comma (,)
replace name = subinstr(name,"'","",.)   //Removes apostrophe (')
replace name = subinstr(name,".","",.)   //Removes dot (.) 
replace name = subinstr(name,"/","",.)   //Removes slash (/)
replace name = subinstr(name,"-","",.)   //Removes dash (-)
replace name = subinstr(name,"=","",.)   //Removes dash (-)
replace name = subinstr(name,"(","",.)   //Removes opening parentheses
replace name = subinstr(name,")","",.)   //Removes closing parentheses

gen dateofbirth1 = date(dateofbirth, "MDY")
format dateofbirth1 %td
g b_month = month(dateofbirth1)
g b_day = day(dateofbirth1)
g b_year = year(dateofbirth1)

g sex = 1 if gender == "M"
replace sex = 2 if gender == "F"
replace sex = 3 if gender == "\N"

drop dateofbirth dateofbirth1

g double dateofbirth = b_year*10^4 + b_month*10^2 + b_day
format dateofbirth %20.0f 
tostring dateofbirth, replace 

format course_start_date %td
g app_year = year(course_start_date)
lab var app_year "year of application and start of course"

g a_admtvet = "Yes"
replace a_admtvet = "No" if app_status == "Not Selected"
replace a_admtvet = "N/A" if app_status == "\N"
lab var a_admtvet "got admission to TVET (administrative data)"

g a_adm_tvet = regexm(a_admtvet, "Yes")
lab var a_adm_tvet "admission to TVET (administrative data)"

g a_apptvet = "Yes"
lab var a_apptvet "application to TVET (administrative data)"

g a_app_tvet = regexm(a_apptvet, "Yes")
lab var a_app_tvet "application to TVET (administrative data)"

g a_adm_tvet_dr = a_adm_tvet
replace a_adm_tvet_dr = . if i_driving_tvetadmin == 1
lab var a_adm_tvet_dr "admission to TVET (administrative data w/o driving centers)"

g a_adm_tvet_drpr = a_adm_tvet
replace a_adm_tvet_drpr = . if i_driving_private == 1
lab var a_adm_tvet_drpr "admission to TVET (administrative data w/o driving centers and private institutes)"


g a_app_tvet_dr = a_app_tvet
replace a_app_tvet_dr = . if i_driving_tvetadmin == 1
lab var a_app_tvet_dr "application to TVET (administrative data w/o driving centers)"

g a_app_tvet_drpr = a_app_tvet
replace a_app_tvet_drpr = . if i_driving_private == 1
lab var a_app_tvet_drpr "application to TVET (administrative data w/o driving centers and private institutes)"

// keep data from 2022 
keep if app_year == 2022

// for students who applied several times, keep the outcomes of the latest application
sort institute_name name citizenid sex a_cbt course_start_date
duplicates tag institute_name name citizenid sex , g(temp)
br institute_name name citizenid sex if temp~=0

g dup = 0
forval i=1/3 {
replace dup = 1 if institute_name == institute_name[_n+`i'] & name == name[_n+`i'] & sex == sex[_n+`i'] & citizenid==citizenid[_n+`i']
}

keep if dup == 0
drop temp

foreach variable in name citizenid dateofbirth sex b_month b_day b_year {
	rename `variable' `variable'_t 
}

keep if inlist(qualification_name, "Class X", "Class X and below") & a_admtvet == "Yes"
g tvetindexnumber = _n
save "$temp/tvetadm_grade10.dta", replace 


********************************************************************************
*
*			MERGE WITH GRADE 10 SURVEY DATA 
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON CITIZEN ID + DOUBLE CHECK NAME & BIRTHDAY
********************************************************************************
gl criteria citizenid  

* open the tvet choice data 
use "$temp/tvetadm_grade10.dta", clear 
rename citizenid_t citizenid 

duplicates tag $criteria, g(temp)
tab temp

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/tvet_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
cap rename citizenid citizenid_t
save $temp/tvet_step1d, replace


* open the survey data 
use "$temp/stem_analysis.dta", clear
replace name = subinstr(name," ","",.)  
replace sex = 2 if sex == 0 
format citizenid %20s

generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
describe citizenid

save "$temp/data10.dta", replace 


use "$temp/data10.dta", clear

duplicates tag $criteria, g(temp)

* obtain the subset of unique citizenid
preserve 
keep if temp==0
drop temp
save $temp/data10_step1u, replace 
restore 

* keep the duplicated for next round match
keep if temp~=0
drop temp
rename citizenid citizenid_t
save $temp/data10_step1d, replace

* merging two unique data together using fully matched citizenid ID & check fuzzy name 
use $temp/tvet_step1u, clear
merge 1:1 $criteria using $temp/data10_step1u
matchit name_t name, g(namescore)
matchit dateofbirth_t dateofbirth, g(bscore)
br namescore bscore if _merge==3
sum namescore if _merge==3

* correct if names are not matched, leave for the next round of matching
replace _merge = . if (namescore <= 0.5 & bscore < 0.8) & _merge == 3

* keep matched data 
preserve 
keep if _merge==3 
keep tvetindexnumber uniqueid 
unique tvetindexnumber
save $temp/tvet_step1m, replace 
restore 

* keep unmatched data from score database, merge with the duplicated from previous step
preserve 
keep if _merge==1 | _merge==.
keep tvetindexnumber 
merge 1:1 tvetindexnumber using $temp/tvetadm_grade10.dta, keep(matched) nogen 
append using $temp/tvet_step1d
save $temp/tvet_step2b, replace 
restore 

preserve 
keep if _merge==2 | _merge==.
keep responseid_bl schoolname_bl 
merge 1:1 responseid_bl schoolname_bl using $temp/data10.dta, keep(matched) nogen 
append using $temp/data10_step1d
save $temp/data10_step2b, replace 
restore 




// STEP 2: PERFECT MATCHING ON NAME SEX DOB + DOUBLE CHECK CID: NO RESULTS 
********************************************************************************


// combine data together 
********************************************************************************
clear 
append using $temp/tvet_step1m
g a_tvet_match_criteria = "perfect(CID) + fuzzy(name/dob)"

merge 1:1 tvetindexnumber using "$temp/tvetadm_grade10.dta", keep(matched) keepusing(a_*) nogen 
drop tvetindexnumber
g a_adm_data = 1 
lab var a_adm_data "1/0 tvet admission data available"

save "$temp/grade10_analysis_tvetadm.dta", replace 

